In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [1258]:
# Stock picker. The first option, 'SELECT', is the placeholder value the
# dropdown starts with; the remaining options are ticker symbols.
w = widgets.Dropdown(
    options=[
        'SELECT',
        'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
        'FB', 'GME', 'MCD', 'PFE', 'PLUG',
        'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU',
    ],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Print the newly selected entry when the widget's ``value`` changes.

    ``change`` is the event dictionary handed to the observer. Anything that
    is not a 'change' event for the trait named 'value' is ignored.
    """
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    print("You have selected %s" % change['new'])

# Subscribe only to changes of the dropdown's `value` trait so the callback is
# not invoked for updates of other traits, then render the widget.
w.observe(on_change, names='value')

display(w)
You have selected BABA
In [1259]:
# Load the feature file for the ticker picked in the dropdown. Every file
# follows the same '/content/Final_<TICKER>.csv' pattern, so the path is built
# from the selection instead of spelling out one branch per ticker.
if w.value == 'SELECT':
    # Without this guard `df` would stay undefined (or stale from an earlier
    # run) and the problem would only surface in a later cell.
    raise ValueError("Choose a stock from the dropdown before loading the data.")

df = pd.read_csv(f'/content/Final_{w.value}.csv')
In [1260]:
pd.set_option('display.max_colwidth', None)
In [1261]:
# Parse the Date column into pandas datetimes (datetime64[ns]).
df['Date'] = pd.to_datetime(df['Date'])
In [1262]:
# Remove the 'Unnamed: 0' column (presumably an index column saved with the
# CSV — verify). errors='ignore' keeps this cell safe to re-run: a second run
# finds the column already gone instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
In [1263]:
df.head(5)
Out[1263]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2020-09-25 267.570007 271.809998 264.559998 271.089996 271.089996 11466600 0.504202 0.679774 4.000926 7.145113 277.001897 269.000971 273.001434 NaN 2.635698 7.250000 34.133663 NaN NaN NaN -0.519989 NaN -0.001914 46.558973 NaN NaN 45.216111 47.318064 8.236911e+07 6.869274e+06 34426500.0 9.135463e+05 5.819177e+05 9.135463e+05 9.135463e+05 9.135463e+05 0.0 9.135463e+05 5.819177e+05 5.819177e+05 5.819177e+05 0.0 5.819177e+05 0 85 85 85 85 0 85 0
1 2020-09-28 275.529999 278.839996 274.199005 276.010010 276.010010 8761700 1.814900 1.063613 4.236484 7.231525 277.159408 268.926320 273.042864 NaN 2.620023 7.750000 1.953963 NaN NaN NaN 1.910004 NaN 0.006968 51.020053 NaN NaN 52.841655 48.031546 8.044538e+07 6.674414e+06 43188200.0 4.330681e+06 2.758590e+06 4.330681e+06 4.330681e+06 4.330681e+06 0.0 4.330681e+06 2.758590e+06 2.758590e+06 2.758590e+06 0.0 2.758590e+06 3 316 319 319 319 0 319 0
2 2020-09-29 275.429993 279.299988 274.899994 276.929993 276.929993 7673300 0.333315 1.093866 5.920861 6.827021 278.555141 268.822013 273.688577 -4.280573 2.465251 4.399994 3.927125 NaN NaN NaN -1.029999 -1.535394 -0.003706 51.829786 NaN NaN 71.104088 56.387285 7.985245e+07 5.799274e+06 50861500.0 2.047828e+06 1.304441e+06 2.047828e+06 2.047828e+06 2.047828e+06 0.0 2.047828e+06 1.304441e+06 1.304441e+06 1.304441e+06 0.0 1.304441e+06 3 321 324 324 324 0 324 0
3 2020-09-30 284.010010 295.000000 283.709991 293.980011 293.980011 24777700 6.156797 0.688753 56.444319 8.433162 291.594464 261.542691 276.568577 -3.314676 2.868617 18.070007 46.666583 NaN NaN NaN 15.839996 -1.185999 0.056950 63.780543 NaN NaN 86.917557 70.287767 1.001531e+08 1.137335e+07 75639200.0 9.212067e+06 5.867973e+06 9.212067e+06 9.212067e+06 9.212067e+06 0.0 9.212067e+06 5.867973e+06 5.867973e+06 5.867973e+06 0.0 5.867973e+06 16 835 851 851 851 0 851 0
4 2020-10-01 295.260010 295.589996 288.250000 290.049988 290.049988 16304000 -1.336833 0.665559 77.728785 8.276995 296.309932 261.044360 278.677146 -2.462945 2.853644 7.339996 47.686899 NaN NaN NaN 14.329987 -0.880757 0.051973 60.080512 NaN NaN 87.572263 81.864636 9.184556e+07 9.976408e+06 59335200.0 6.850694e+06 4.363808e+06 6.850694e+06 6.850694e+06 6.850694e+06 0.0 6.850694e+06 4.363808e+06 4.363808e+06 4.363808e+06 0.0 4.363808e+06 7 623 630 630 630 0 630 0
In [1264]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       311 non-null    datetime64[ns]
 1   Open                       311 non-null    float64       
 2   High                       311 non-null    float64       
 3   Low                        311 non-null    float64       
 4   Close                      311 non-null    float64       
 5   Adj Close                  311 non-null    float64       
 6   Volume                     311 non-null    int64         
 7   Return                     311 non-null    float64       
 8   Beta                       311 non-null    float64       
 9   Variance                   311 non-null    float64       
 10  AvgTrueRange               311 non-null    float64       
 11  Upperband                  311 non-null    float64       
 12  Lowerband                  311 non-null    float64       
 13  Middleband                 311 non-null    float64       
 14  APO                        309 non-null    float64       
 15  NATR                       311 non-null    float64       
 16  TRANGE                     311 non-null    float64       
 17  DMI                        311 non-null    float64       
 18  MACD                       301 non-null    float64       
 19  MACDSIGNAL                 301 non-null    float64       
 20  MACDHIST                   301 non-null    float64       
 21  MOM                        311 non-null    float64       
 22  PPO                        309 non-null    float64       
 23  ROCP                       311 non-null    float64       
 24  RSI                        311 non-null    float64       
 25  TRIX                       246 non-null    float64       
 26  ULTOSC                     306 non-null    float64       
 27  SLOWK                      311 non-null    float64       
 28  SLOWD                      311 non-null    float64       
 29  AD                         311 non-null    float64       
 30  ADOSC                      311 non-null    float64       
 31  OBV                        311 non-null    float64       
 32  Upward_momentum_created    311 non-null    float64       
 33  Downward_momentum_created  311 non-null    float64       
 34  B5_O_Um                    311 non-null    float64       
 35  B5_C_Um                    311 non-null    float64       
 36  B5_E_Um                    311 non-null    float64       
 37  B5_A_Um                    311 non-null    float64       
 38  B5_N_Um                    311 non-null    float64       
 39  B5_O_Dm                    311 non-null    float64       
 40  B5_C_Dm                    311 non-null    float64       
 41  B5_E_Dm                    311 non-null    float64       
 42  B5_A_Dm                    311 non-null    float64       
 43  B5_N_Dm                    311 non-null    float64       
 44  Verified_status_True       311 non-null    int64         
 45  Verified_status_False      311 non-null    int64         
 46  O                          311 non-null    int64         
 47  C                          311 non-null    int64         
 48  E                          311 non-null    int64         
 49  A                          311 non-null    int64         
 50  N                          311 non-null    int64         
 51  Real_or_Fake_tweet         311 non-null    int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 126.5 KB
In [1265]:
df.shape
Out[1265]:
(311, 52)
In [1266]:
sns.set(font_scale=0.8)
In [1267]:
# Use seaborn's "talk" plotting context with an extra font scale so that
# labels stay readable on the wide figure below.
sns.set_context("talk", font_scale=1.3)

# Plot the loaded stock's closing price against the Date column.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18, 8))
    # Draw on the axes created above explicitly rather than relying on the
    # implicit "current axes".
    sns.lineplot(x=df['Date'], y=df['Close'], color='blue', ax=ax)
    ax.set_title('Closing Price')
In [1268]:
# Daily simple returns in percent, from pct_change() on the closing price.
# The first row has no previous close and is therefore NaN. Dropping it before
# the assignment has no effect (index alignment restores the NaN), so no
# dropna() is applied here.
df['returns'] = 100 * df['Close'].pct_change()
In [1269]:
# Daily log returns: ln(Close_t / Close_{t-1}); the first row is NaN because
# it has no previous close.
previous_close = df['Close'].shift(1)
df['log_returns'] = np.log(df['Close'] / previous_close)
In [1270]:
df.head()
Out[1270]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2020-09-25 267.570007 271.809998 264.559998 271.089996 271.089996 11466600 0.504202 0.679774 4.000926 7.145113 277.001897 269.000971 273.001434 NaN 2.635698 7.250000 34.133663 NaN NaN NaN -0.519989 NaN -0.001914 46.558973 NaN NaN 45.216111 47.318064 8.236911e+07 6.869274e+06 34426500.0 9.135463e+05 5.819177e+05 9.135463e+05 9.135463e+05 9.135463e+05 0.0 9.135463e+05 5.819177e+05 5.819177e+05 5.819177e+05 0.0 5.819177e+05 0 85 85 85 85 0 85 0 NaN NaN
1 2020-09-28 275.529999 278.839996 274.199005 276.010010 276.010010 8761700 1.814900 1.063613 4.236484 7.231525 277.159408 268.926320 273.042864 NaN 2.620023 7.750000 1.953963 NaN NaN NaN 1.910004 NaN 0.006968 51.020053 NaN NaN 52.841655 48.031546 8.044538e+07 6.674414e+06 43188200.0 4.330681e+06 2.758590e+06 4.330681e+06 4.330681e+06 4.330681e+06 0.0 4.330681e+06 2.758590e+06 2.758590e+06 2.758590e+06 0.0 2.758590e+06 3 316 319 319 319 0 319 0 1.814900 0.017986
2 2020-09-29 275.429993 279.299988 274.899994 276.929993 276.929993 7673300 0.333315 1.093866 5.920861 6.827021 278.555141 268.822013 273.688577 -4.280573 2.465251 4.399994 3.927125 NaN NaN NaN -1.029999 -1.535394 -0.003706 51.829786 NaN NaN 71.104088 56.387285 7.985245e+07 5.799274e+06 50861500.0 2.047828e+06 1.304441e+06 2.047828e+06 2.047828e+06 2.047828e+06 0.0 2.047828e+06 1.304441e+06 1.304441e+06 1.304441e+06 0.0 1.304441e+06 3 321 324 324 324 0 324 0 0.333315 0.003328
3 2020-09-30 284.010010 295.000000 283.709991 293.980011 293.980011 24777700 6.156797 0.688753 56.444319 8.433162 291.594464 261.542691 276.568577 -3.314676 2.868617 18.070007 46.666583 NaN NaN NaN 15.839996 -1.185999 0.056950 63.780543 NaN NaN 86.917557 70.287767 1.001531e+08 1.137335e+07 75639200.0 9.212067e+06 5.867973e+06 9.212067e+06 9.212067e+06 9.212067e+06 0.0 9.212067e+06 5.867973e+06 5.867973e+06 5.867973e+06 0.0 5.867973e+06 16 835 851 851 851 0 851 0 6.156797 0.059747
4 2020-10-01 295.260010 295.589996 288.250000 290.049988 290.049988 16304000 -1.336833 0.665559 77.728785 8.276995 296.309932 261.044360 278.677146 -2.462945 2.853644 7.339996 47.686899 NaN NaN NaN 14.329987 -0.880757 0.051973 60.080512 NaN NaN 87.572263 81.864636 9.184556e+07 9.976408e+06 59335200.0 6.850694e+06 4.363808e+06 6.850694e+06 6.850694e+06 6.850694e+06 0.0 6.850694e+06 4.363808e+06 4.363808e+06 4.363808e+06 0.0 4.363808e+06 7 623 630 630 630 0 630 0 -1.336833 -0.013458
In [1271]:
# Drop every row that contains a NaN in ANY column. This removes the first row
# (its returns/log_returns are NaN because there is no previous close) and also
# every row where some other column is still missing.
# NOTE(review): the df.info() outputs recorded in this notebook show 311 rows
# before and 246 rows after (index 65 to 310), so far more than the first row
# is discarded here. If only the first row should go, restrict the call with
# subset=['returns', 'log_returns'] — confirm the intent.
df.dropna(inplace=True)
In [1272]:
# Compare simple returns and log returns: the left column shows each series as
# a line plot, the right column its histogram together with a fitted normal
# curve (fit=stats.norm).
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    panels = [
        (df.returns, 'Returns', 'blue'),
        (df.log_returns, 'Log Returns', 'green'),
    ]
    # One row of the 2x2 grid per series: (line plot axes, histogram axes).
    for (line_ax, dist_ax), (values, title, colour) in zip(axes, panels):
        line_ax.plot(values, color=colour)
        line_ax.set_title(title)

        sns.distplot(values, norm_hist=True, fit=stats.norm, color=colour,
                     bins=50, ax=dist_ax)
        dist_ax.set_title(title)

    plt.tight_layout()
    fig.show();
In [1273]:
# CREATE A FUNCTION THAT CALCULATES REALIZED VOLATILITY
# FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Realized volatility of one window of daily log returns.

    Computed as sqrt(sum(r_i ** 2) / (n - 1)), where n is the number of
    observations in ``series_log_return``.

    Parameters
    ----------
    series_log_return : pandas Series or NumPy array
        Log returns of the window; must support ``len()`` and ``** 2``.

    Returns
    -------
    float
        The realized volatility, or NaN when fewer than two observations are
        supplied (the n - 1 denominator would be zero or negative).
    """
    n = len(series_log_return)
    if n < 2:
        # Avoid dividing by zero (n == 1) or by a negative number (n == 0).
        return np.nan
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [1274]:
intervals = [7, 30, 60, 180, 365]

# One column of rolling realized volatility per window length in `intervals`,
# computed from the log returns and aligned to df's index.
vols_df = pd.DataFrame(
    {
        window: (
            df['log_returns']
            .rolling(window=window)
            .apply(realized_volatility_daily)
            .values
        )
        for window in intervals
    },
    columns=intervals,
    index=df.index,
)
In [1275]:
# Switch matplotlib to the 'fivethirtyeight' style sheet.
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

# One line per window length; the 7-day series is drawn thinner and
# semi-transparent, all other windows at full opacity.
for i in intervals:
    alpha, lw = (0.5, 1) if i == 7 else (1.0, 2)
    ax.plot(vols_df[i], label=f'{i}-Day Interval Realized Volatility',
            alpha=alpha, lw=lw)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

ax.legend(loc='best', prop={'size': 14})
plt.show();
In [1276]:
INTERVAL_WINDOW = 30
n_future = 7

# Backward-looking realized volatility: rolling window of INTERVAL_WINDOW rows
# ending at the current row.
df['vol_current'] = (
    df['log_returns']
    .rolling(window=INTERVAL_WINDOW)
    .apply(realized_volatility_daily)
)

# Forward-looking realized volatility: the log returns are first shifted up by
# n_future rows, so each row's rolling window ends n_future rows later.
df['vol_future'] = (
    df['log_returns']
    .shift(-n_future)
    .rolling(window=INTERVAL_WINDOW)
    .apply(realized_volatility_daily)
)
In [1277]:
df.describe()
Out[1277]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 246.000000 246.000000 246.000000 246.000000 246.000000 2.460000e+02 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 246.0 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 246.0 2.460000e+02 246.000000 246.000000 246.000000 246.000000 246.000000 246.0 246.000000 246.0 246.000000 246.000000 217.000000 210.000000
mean 202.102943 204.345590 199.198403 201.592561 201.592561 2.094423e+07 -0.207049 0.839317 32.319721 6.502616 212.941723 193.082214 203.011969 -3.808441 3.325959 6.390142 31.200753 -3.624506 -3.582274 -0.042233 -5.192805 -2.017674 -0.024894 43.425184 -0.233060 45.409688 43.950114 43.867621 -4.813950e+07 -4.653100e+06 -3.650872e+08 6.650605e+06 4.236354e+06 6.650605e+06 6.650605e+06 6.650605e+06 0.0 6.650605e+06 4.236354e+06 4.236354e+06 4.236354e+06 0.0 4.236354e+06 9.170732 673.873984 683.044715 683.044715 683.044715 0.0 683.044715 0.0 -0.207049 -0.002437 0.025311 0.025219
std 40.187126 40.248797 39.962360 40.042940 40.042940 1.263946e+07 2.702073 0.263724 39.671457 1.678172 39.477287 39.109658 38.899957 7.426823 0.942754 3.255811 21.577145 4.322615 3.780045 1.797372 15.843726 3.973893 0.082139 9.940103 0.117800 8.602399 24.208167 22.293852 1.094812e+08 1.293811e+07 2.001141e+08 8.942137e+06 5.696031e+06 8.942137e+06 8.942137e+06 8.942137e+06 0.0 8.942137e+06 5.696031e+06 5.696031e+06 5.696031e+06 0.0 5.696031e+06 9.209762 485.568983 492.925695 492.925695 492.925695 0.0 492.925695 0.0 2.702073 0.027033 0.005847 0.005913
min 113.639999 118.190002 108.699997 111.959999 111.959999 7.650300e+06 -11.127612 -0.198108 1.755223 3.256017 127.139608 110.961141 122.082857 -18.518204 1.540799 1.800003 0.695081 -11.778272 -9.969750 -3.832131 -45.939995 -12.566746 -0.272754 19.124675 -0.518539 21.514220 2.065251 3.308153 -3.207712e+08 -5.665548e+07 -8.487745e+08 1.427748e+06 9.094581e+05 1.427748e+06 1.427748e+06 1.427748e+06 0.0 1.427748e+06 9.094581e+05 9.094581e+05 9.094581e+05 0.0 9.094581e+05 0.000000 249.000000 252.000000 252.000000 252.000000 0.0 252.000000 0.0 -11.127612 -0.117969 0.014203 0.014203
25% 167.340004 169.092495 165.471249 166.862507 166.862507 1.335935e+07 -1.641180 0.682664 8.341116 5.469164 176.637416 157.830649 166.952859 -8.856619 2.710870 4.082493 14.466935 -6.710149 -6.767930 -1.106504 -15.175007 -4.199853 -0.074622 36.238061 -0.306268 39.802147 21.287765 23.050531 -1.065624e+08 -1.155190e+07 -5.232798e+08 3.029066e+06 1.929478e+06 3.029066e+06 3.029066e+06 3.029066e+06 0.0 3.029066e+06 1.929478e+06 1.929478e+06 1.929478e+06 0.0 1.929478e+06 3.000000 373.000000 377.750000 377.750000 377.750000 0.0 377.750000 0.0 -1.641180 -0.016548 0.021871 0.021783
50% 212.400002 213.945000 209.994995 211.550003 211.550003 1.722295e+07 -0.385199 0.804448 19.042795 6.164824 219.128853 206.451059 212.487859 -4.251988 3.253407 5.614998 26.945775 -4.307832 -3.879066 -0.024890 -5.349991 -2.030155 -0.024919 42.439374 -0.194787 44.963862 45.237368 45.677399 -6.170477e+07 -5.991052e+06 -3.041592e+08 4.143884e+06 2.639603e+06 4.143884e+06 4.143884e+06 4.143884e+06 0.0 4.143884e+06 2.639603e+06 2.639603e+06 2.639603e+06 0.0 2.639603e+06 6.000000 501.500000 508.500000 508.500000 508.500000 0.0 508.500000 0.0 -0.385199 -0.003859 0.025276 0.024920
75% 230.672493 235.244999 228.044998 230.889999 230.889999 2.371040e+07 1.309460 0.982070 36.798831 7.332456 242.479356 222.536910 232.933211 0.685160 3.805710 7.416000 43.807030 -0.461419 -1.030653 0.920266 5.184998 0.323749 0.023818 50.042911 -0.149140 51.273131 63.424474 62.639209 2.550618e+07 8.951113e+05 -2.012475e+08 7.026527e+06 4.475811e+06 7.026527e+06 7.026527e+06 7.026527e+06 0.0 7.026527e+06 4.475811e+06 4.475811e+06 4.475811e+06 0.0 4.475811e+06 11.000000 809.750000 818.250000 818.250000 818.250000 0.0 818.250000 0.0 1.309460 0.013010 0.028740 0.028737
max 271.000000 274.290009 270.399994 270.829987 270.829987 8.869920e+07 10.396570 1.608871 260.397922 11.844461 277.221034 263.981180 268.157144 14.507949 6.206293 21.600006 90.473821 5.110634 4.420525 4.765163 38.550018 7.487299 0.236552 68.292867 -0.049300 68.355914 90.889135 89.175297 1.625807e+08 3.474256e+07 -5.681090e+07 9.093028e+07 5.792147e+07 9.093028e+07 9.093028e+07 9.093028e+07 0.0 9.093028e+07 5.792147e+07 5.792147e+07 5.792147e+07 0.0 5.792147e+07 58.000000 2896.000000 2925.000000 2925.000000 2925.000000 0.0 2925.000000 0.0 10.396570 0.098909 0.040412 0.040412
In [1278]:
# Give the tweet-authenticity column a shorter name.
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [1279]:
# Fill the remaining gaps with each numeric column's median. numeric_only=True
# keeps the datetime `Date` column out of the median computation, so the result
# does not depend on how the installed pandas version treats it.
# NOTE(review): this also imputes `vol_current` and `vol_future` with a
# full-sample median; if `vol_future` is used as a prediction target later,
# dropping those rows may be more appropriate — confirm against downstream use.
df = df.fillna(df.median(numeric_only=True))
In [1280]:
df.isna().sum()
Out[1280]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1281]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 65 to 310
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       246 non-null    datetime64[ns]
 1   Open                       246 non-null    float64       
 2   High                       246 non-null    float64       
 3   Low                        246 non-null    float64       
 4   Close                      246 non-null    float64       
 5   Adj Close                  246 non-null    float64       
 6   Volume                     246 non-null    int64         
 7   Return                     246 non-null    float64       
 8   Beta                       246 non-null    float64       
 9   Variance                   246 non-null    float64       
 10  AvgTrueRange               246 non-null    float64       
 11  Upperband                  246 non-null    float64       
 12  Lowerband                  246 non-null    float64       
 13  Middleband                 246 non-null    float64       
 14  APO                        246 non-null    float64       
 15  NATR                       246 non-null    float64       
 16  TRANGE                     246 non-null    float64       
 17  DMI                        246 non-null    float64       
 18  MACD                       246 non-null    float64       
 19  MACDSIGNAL                 246 non-null    float64       
 20  MACDHIST                   246 non-null    float64       
 21  MOM                        246 non-null    float64       
 22  PPO                        246 non-null    float64       
 23  ROCP                       246 non-null    float64       
 24  RSI                        246 non-null    float64       
 25  TRIX                       246 non-null    float64       
 26  ULTOSC                     246 non-null    float64       
 27  SLOWK                      246 non-null    float64       
 28  SLOWD                      246 non-null    float64       
 29  AD                         246 non-null    float64       
 30  ADOSC                      246 non-null    float64       
 31  OBV                        246 non-null    float64       
 32  Upward_momentum_created    246 non-null    float64       
 33  Downward_momentum_created  246 non-null    float64       
 34  B5_O_Um                    246 non-null    float64       
 35  B5_C_Um                    246 non-null    float64       
 36  B5_E_Um                    246 non-null    float64       
 37  B5_A_Um                    246 non-null    float64       
 38  B5_N_Um                    246 non-null    float64       
 39  B5_O_Dm                    246 non-null    float64       
 40  B5_C_Dm                    246 non-null    float64       
 41  B5_E_Dm                    246 non-null    float64       
 42  B5_A_Dm                    246 non-null    float64       
 43  B5_N_Dm                    246 non-null    float64       
 44  Verified_status_True       246 non-null    int64         
 45  Verified_status_False      246 non-null    int64         
 46  O                          246 non-null    int64         
 47  C                          246 non-null    int64         
 48  E                          246 non-null    int64         
 49  A                          246 non-null    int64         
 50  N                          246 non-null    int64         
 51  Fake_news                  246 non-null    int64         
 52  returns                    246 non-null    float64       
 53  log_returns                246 non-null    float64       
 54  vol_current                246 non-null    float64       
 55  vol_future                 246 non-null    float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 109.5 KB
In [1282]:
df.shape
Out[1282]:
(246, 56)
In [1283]:
df=df.dropna()
In [1284]:
df.dtypes
Out[1284]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1285]:
# Annotated correlation heatmap of all numeric columns. matplotlib and seaborn
# are already imported in the notebook's import cell, so they are not
# re-imported here. The numeric columns are selected explicitly rather than
# relying on corr() to leave out the datetime `Date` column.
fig, ax = plt.subplots(figsize=(40, 15))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, ax=ax)
Out[1285]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f077ced6110>
In [1286]:
# Grid of histograms produced by DataFrame.hist, 70 bins each, with small tick labels.
df.hist(
    bins=70,
    figsize=(20, 32),
    xlabelsize=8,
    ylabelsize=8,
);
In [1287]:
# Columns whose absolute correlation with AvgTrueRange exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['AvgTrueRange']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with AvgTrueRange:\n"
    f"{golden_features_list}"
)
There are 5 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
NATR            0.609427
TRANGE          0.578297
AD              0.538605
Variance        0.500639
Name: AvgTrueRange, dtype: float64
In [1288]:
# Columns whose absolute correlation with NATR exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['NATR']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with NATR :\n"
    f"{golden_features_list}"
)
There are 22 strongly correlated values with NATR :
NATR                     1.000000
vol_current              0.790280
vol_future               0.681911
AvgTrueRange             0.609427
Volume                   0.607467
Verified_status_False    0.569793
E                        0.569647
C                        0.569647
O                        0.569647
N                        0.569647
High                    -0.526910
Middleband              -0.529538
Open                    -0.537415
Adj Close               -0.537994
Close                   -0.537994
APO                     -0.539999
MACDSIGNAL              -0.547703
Low                     -0.553927
OBV                     -0.559170
PPO                     -0.590124
Lowerband               -0.593069
MACD                    -0.596103
Name: NATR, dtype: float64
In [1289]:
# Columns whose absolute correlation with TRANGE exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['TRANGE']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with TRANGE:\n"
    f"{golden_features_list}"
)
There are 9 strongly correlated values with TRANGE:
TRANGE                   1.000000
Volume                   0.705909
N                        0.686315
E                        0.686315
C                        0.686315
O                        0.686315
Verified_status_False    0.684807
Verified_status_True     0.627706
AvgTrueRange             0.578297
Name: TRANGE, dtype: float64
In [1290]:
# Columns whose absolute correlation with the O (Openness) column exceeds 0.5, sorted descending.
df_corr = df.corr()['O']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Openness:\n"
    f"{golden_features_list}"
)
There are 19 strongly correlated values with Openness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999936
Volume                       0.897375
Verified_status_True         0.802176
TRANGE                       0.686315
NATR                         0.569647
B5_N_Dm                      0.569241
B5_E_Dm                      0.569241
B5_C_Dm                      0.569241
Downward_momentum_created    0.569241
B5_O_Dm                      0.569241
B5_N_Um                      0.569241
B5_E_Um                      0.569241
B5_C_Um                      0.569241
B5_O_Um                      0.569241
Upward_momentum_created      0.569241
Name: O, dtype: float64
In [1291]:
# Columns whose absolute correlation with the C (conscientiousness) column exceeds 0.5, sorted descending.
df_corr = df.corr()['C']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with conscientiousness:\n"
    f"{golden_features_list}"
)
There are 19 strongly correlated values with conscientiousness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999936
Volume                       0.897375
Verified_status_True         0.802176
TRANGE                       0.686315
NATR                         0.569647
B5_N_Dm                      0.569241
B5_E_Dm                      0.569241
B5_C_Dm                      0.569241
Downward_momentum_created    0.569241
B5_O_Dm                      0.569241
B5_N_Um                      0.569241
B5_E_Um                      0.569241
B5_C_Um                      0.569241
B5_O_Um                      0.569241
Upward_momentum_created      0.569241
Name: C, dtype: float64
In [1292]:
# Columns whose absolute correlation with the E column exceeds 0.5, sorted descending.
# The message names the trait actually analysed here; it used to say "conscientiousness",
# which belongs to the C column. Assumes O/C/E/A/N follow the Big Five (OCEAN) naming — TODO confirm.
df_corr = df.corr()['E']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Extraversion:\n"
    f"{golden_features_list}"
)
There are 19 strongly correlated values with conscientiousness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999936
Volume                       0.897375
Verified_status_True         0.802176
TRANGE                       0.686315
NATR                         0.569647
B5_N_Dm                      0.569241
B5_E_Dm                      0.569241
B5_C_Dm                      0.569241
Downward_momentum_created    0.569241
B5_O_Dm                      0.569241
B5_N_Um                      0.569241
B5_E_Um                      0.569241
B5_C_Um                      0.569241
B5_O_Um                      0.569241
Upward_momentum_created      0.569241
Name: E, dtype: float64
In [1293]:
# Columns whose absolute correlation with the A column exceeds 0.5, sorted descending.
# The message names the trait actually analysed here; it used to say "conscientiousness",
# which belongs to the C column. Assumes O/C/E/A/N follow the Big Five (OCEAN) naming — TODO confirm.
# NOTE(review): describe() further down reports mean 0 and std 0 for A, so an empty result here
# is presumably due to the column being constant rather than to weak correlations — confirm.
df_corr = df.corr()['A']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Agreeableness:\n"
    f"{golden_features_list}"
)
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1294]:
# Columns whose absolute correlation with the N column exceeds 0.5, sorted descending.
# The message names the trait actually analysed here; it used to say "conscientiousness",
# which belongs to the C column. Assumes O/C/E/A/N follow the Big Five (OCEAN) naming — TODO confirm.
df_corr = df.corr()['N']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Neuroticism:\n"
    f"{golden_features_list}"
)
There are 19 strongly correlated values with conscientiousness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999936
Volume                       0.897375
Verified_status_True         0.802176
TRANGE                       0.686315
NATR                         0.569647
B5_N_Dm                      0.569241
B5_E_Dm                      0.569241
B5_C_Dm                      0.569241
Downward_momentum_created    0.569241
B5_O_Dm                      0.569241
B5_N_Um                      0.569241
B5_E_Um                      0.569241
B5_C_Um                      0.569241
B5_O_Um                      0.569241
Upward_momentum_created      0.569241
Name: N, dtype: float64
In [1295]:
# Display df's column labels.
df.columns
Out[1295]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [1296]:
# Columns whose absolute correlation with B5_O_Um exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_O_Um']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_O_Um:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_O_Um:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_O_Um, dtype: float64
In [1297]:
# Columns whose absolute correlation with B5_C_Um exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_C_Um']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_C_Um:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_C_Um:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_C_Um, dtype: float64
In [1298]:
# Columns whose absolute correlation with B5_E_Um exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_E_Um']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_E_Um:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_E_Um:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_E_Um, dtype: float64
In [1299]:
# Columns whose absolute correlation with B5_A_Um exceeds 0.5, sorted from highest to lowest.
# NOTE(review): describe() further down reports mean 0 and std 0 for B5_A_Um, so an empty
# result is presumably caused by the column being constant — confirm.
df_corr = df.corr()['B5_A_Um']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_A_Um:\n"
    f"{golden_features_list}"
)
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1300]:
# Columns whose absolute correlation with B5_N_Um exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_N_Um']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_N_Um:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_N_Um:
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_N_Um, dtype: float64

Downward momentum correlation

In [1301]:
# Columns whose absolute correlation with B5_O_Dm exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_O_Dm']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_O_Dm:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_O_Dm:
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_O_Dm, dtype: float64
In [1302]:
# Columns whose absolute correlation with B5_C_Dm exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_C_Dm']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_C_Dm:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_C_Dm:
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_C_Dm, dtype: float64
In [1303]:
# Columns whose absolute correlation with B5_E_Dm exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_E_Dm']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_E_Dm:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_E_Dm:
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_E_Dm, dtype: float64
In [1304]:
# Columns whose absolute correlation with B5_A_Dm exceeds 0.5, sorted from highest to lowest.
# NOTE(review): describe() further down reports mean 0 and std 0 for B5_A_Dm, so an empty
# result is presumably caused by the column being constant — confirm.
df_corr = df.corr()['B5_A_Dm']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_A_Dm:\n"
    f"{golden_features_list}"
)
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1305]:
# Columns whose absolute correlation with B5_N_Dm exceeds 0.5, sorted from highest to lowest.
df_corr = df.corr()['B5_N_Dm']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with B5_N_Dm:\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with B5_N_Dm:
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: B5_N_Dm, dtype: float64
In [1306]:
# Columns whose absolute correlation with Fake_news exceeds 0.5, sorted from highest to lowest.
# NOTE(review): describe() further down reports mean 0 and std 0 for Fake_news, so an empty
# result is presumably caused by the column being constant — confirm.
df_corr = df.corr()['Fake_news']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Real_or_Fake_tweet :\n"
    f"{golden_features_list}"
)
There are 0 strongly correlated values with Real_or_Fake_tweet :
Series([], Name: Fake_news, dtype: float64)
In [1307]:
# Columns whose absolute correlation with Downward_momentum_created exceeds 0.5, sorted descending.
df_corr = df.corr()['Downward_momentum_created']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Downward_momentum_created :\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with Downward_momentum_created :
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: Downward_momentum_created, dtype: float64
In [1308]:
# Columns whose absolute correlation with Upward_momentum_created exceeds 0.5, sorted descending.
df_corr = df.corr()['Upward_momentum_created']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Upward_momentum_created :\n"
    f"{golden_features_list}"
)
There are 16 strongly correlated values with Upward_momentum_created :
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Um                      1.000000
B5_E_Um                      1.000000
B5_C_Um                      1.000000
B5_O_Um                      1.000000
Upward_momentum_created      1.000000
Verified_status_True         0.693323
N                            0.569241
E                            0.569241
C                            0.569241
O                            0.569241
Verified_status_False        0.564715
Name: Upward_momentum_created, dtype: float64
In [1309]:
# Columns whose absolute correlation with Verified_status_True exceeds 0.5, sorted descending.
df_corr = df.corr()['Verified_status_True']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Verified_status_True :\n"
    f"{golden_features_list}"
)
There are 18 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
N                            0.802176
E                            0.802176
C                            0.802176
O                            0.802176
Verified_status_False        0.795363
Volume                       0.725163
B5_N_Um                      0.693323
B5_E_Um                      0.693323
B5_C_Um                      0.693323
B5_O_Um                      0.693323
Upward_momentum_created      0.693323
B5_E_Dm                      0.693323
B5_C_Dm                      0.693323
B5_N_Dm                      0.693323
Downward_momentum_created    0.693323
B5_O_Dm                      0.693323
TRANGE                       0.627706
Name: Verified_status_True, dtype: float64
In [1310]:
# Columns whose absolute correlation with Verified_status_False exceeds 0.5, sorted descending.
df_corr = df.corr()['Verified_status_False']
golden_features_list = (
    df_corr[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    f"There are {len(golden_features_list)} strongly correlated values with Verified_status_False :\n"
    f"{golden_features_list}"
)
There are 19 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
N                            0.999936
E                            0.999936
C                            0.999936
O                            0.999936
Volume                       0.897216
Verified_status_True         0.795363
TRANGE                       0.684807
NATR                         0.569793
B5_N_Dm                      0.564715
B5_E_Dm                      0.564715
B5_C_Dm                      0.564715
B5_N_Um                      0.564715
B5_E_Um                      0.564715
B5_C_Um                      0.564715
B5_O_Um                      0.564715
Downward_momentum_created    0.564715
Upward_momentum_created      0.564715
B5_O_Dm                      0.564715
Name: Verified_status_False, dtype: float64
In [1311]:
# Set seaborn's font scale to 0.8 for the plots that follow.
sns.set(font_scale=0.8)
In [1312]:
# Plot NATR (y axis) against df's columns, taking the columns five at a time so that
# each pairplot call draws one row of at most five panels.
for i in range(0, len(df.columns), 5):
    sns.pairplot(
        data=df,
        y_vars=['NATR'],
        x_vars=df.columns[i:i + 5],
    )
In [1313]:
# Display the dtype of every column in df.
df.dtypes
Out[1313]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [1314]:
# Count the missing values in each column (isna is the pandas alias of isnull).
df.isna().sum()
Out[1314]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [1315]:
# Replace any missing values with 0, modifying df in place.
# The isnull().sum() output just above reports zero missing values in every column for this run.
df.fillna(0, inplace = True)
In [1316]:
# Drop rows that contain missing values, modifying df in place.
# NOTE(review): this follows fillna(0), so there is presumably nothing left to drop — confirm.
df.dropna(inplace=True)
In [1317]:
# Set seaborn's font scale to 0.8 (the same call already appears in an earlier cell).
sns.set(font_scale=0.8)
In [1318]:
# Correlation matrix of every column except Close.
corr = df.drop(columns='Close').corr()
plt.figure(figsize=(12, 10))

# Only cells with correlation >= 0.5 or <= -0.4 are kept; everything else becomes NaN
# and is left blank by the heatmap.
# NOTE(review): the positive and negative cut-offs differ (0.5 vs -0.4) — confirm this is intended.
sns.heatmap(
    corr.where((corr >= 0.5) | (corr <= -0.4)),
    cmap='YlGnBu',
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);
In [1319]:
# Display summary statistics (count, mean, std, min, quartiles, max) for df's columns.
df.describe()
Out[1319]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 246.000000 246.000000 246.000000 246.000000 246.000000 2.460000e+02 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 246.000000 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 246.0 2.460000e+02 2.460000e+02 2.460000e+02 2.460000e+02 246.0 2.460000e+02 246.000000 246.000000 246.000000 246.000000 246.000000 246.0 246.000000 246.0 246.000000 246.000000 246.000000 246.000000
mean 202.102943 204.345590 199.198403 201.592561 201.592561 2.094423e+07 -0.207049 0.839317 32.319721 6.502616 212.941723 193.082214 203.011969 -3.808441 3.325959 6.390142 31.200753 -3.624506 -3.582274 -0.042233 -5.192805 -2.017674 -0.024894 43.425184 -0.233060 45.409688 43.950114 43.867621 -4.813950e+07 -4.653100e+06 -3.650872e+08 6.650605e+06 4.236354e+06 6.650605e+06 6.650605e+06 6.650605e+06 0.0 6.650605e+06 4.236354e+06 4.236354e+06 4.236354e+06 0.0 4.236354e+06 9.170732 673.873984 683.044715 683.044715 683.044715 0.0 683.044715 0.0 -0.207049 -0.002437 0.025307 0.025175
std 40.187126 40.248797 39.962360 40.042940 40.042940 1.263946e+07 2.702073 0.263724 39.671457 1.678172 39.477287 39.109658 38.899957 7.426823 0.942754 3.255811 21.577145 4.322615 3.780045 1.797372 15.843726 3.973893 0.082139 9.940103 0.117800 8.602399 24.208167 22.293852 1.094812e+08 1.293811e+07 2.001141e+08 8.942137e+06 5.696031e+06 8.942137e+06 8.942137e+06 8.942137e+06 0.0 8.942137e+06 5.696031e+06 5.696031e+06 5.696031e+06 0.0 5.696031e+06 9.209762 485.568983 492.925695 492.925695 492.925695 0.0 492.925695 0.0 2.702073 0.027033 0.005490 0.005462
min 113.639999 118.190002 108.699997 111.959999 111.959999 7.650300e+06 -11.127612 -0.198108 1.755223 3.256017 127.139608 110.961141 122.082857 -18.518204 1.540799 1.800003 0.695081 -11.778272 -9.969750 -3.832131 -45.939995 -12.566746 -0.272754 19.124675 -0.518539 21.514220 2.065251 3.308153 -3.207712e+08 -5.665548e+07 -8.487745e+08 1.427748e+06 9.094581e+05 1.427748e+06 1.427748e+06 1.427748e+06 0.0 1.427748e+06 9.094581e+05 9.094581e+05 9.094581e+05 0.0 9.094581e+05 0.000000 249.000000 252.000000 252.000000 252.000000 0.0 252.000000 0.0 -11.127612 -0.117969 0.014203 0.014203
25% 167.340004 169.092495 165.471249 166.862507 166.862507 1.335935e+07 -1.641180 0.682664 8.341116 5.469164 176.637416 157.830649 166.952859 -8.856619 2.710870 4.082493 14.466935 -6.710149 -6.767930 -1.106504 -15.175007 -4.199853 -0.074622 36.238061 -0.306268 39.802147 21.287765 23.050531 -1.065624e+08 -1.155190e+07 -5.232798e+08 3.029066e+06 1.929478e+06 3.029066e+06 3.029066e+06 3.029066e+06 0.0 3.029066e+06 1.929478e+06 1.929478e+06 1.929478e+06 0.0 1.929478e+06 3.000000 373.000000 377.750000 377.750000 377.750000 0.0 377.750000 0.0 -1.641180 -0.016548 0.022556 0.022556
50% 212.400002 213.945000 209.994995 211.550003 211.550003 1.722295e+07 -0.385199 0.804448 19.042795 6.164824 219.128853 206.451059 212.487859 -4.251988 3.253407 5.614998 26.945775 -4.307832 -3.879066 -0.024890 -5.349991 -2.030155 -0.024919 42.439374 -0.194787 44.963862 45.237368 45.677399 -6.170477e+07 -5.991052e+06 -3.041592e+08 4.143884e+06 2.639603e+06 4.143884e+06 4.143884e+06 4.143884e+06 0.0 4.143884e+06 2.639603e+06 2.639603e+06 2.639603e+06 0.0 2.639603e+06 6.000000 501.500000 508.500000 508.500000 508.500000 0.0 508.500000 0.0 -0.385199 -0.003859 0.025276 0.024920
75% 230.672493 235.244999 228.044998 230.889999 230.889999 2.371040e+07 1.309460 0.982070 36.798831 7.332456 242.479356 222.536910 232.933211 0.685160 3.805710 7.416000 43.807030 -0.461419 -1.030653 0.920266 5.184998 0.323749 0.023818 50.042911 -0.149140 51.273131 63.424474 62.639209 2.550618e+07 8.951113e+05 -2.012475e+08 7.026527e+06 4.475811e+06 7.026527e+06 7.026527e+06 7.026527e+06 0.0 7.026527e+06 4.475811e+06 4.475811e+06 4.475811e+06 0.0 4.475811e+06 11.000000 809.750000 818.250000 818.250000 818.250000 0.0 818.250000 0.0 1.309460 0.013010 0.028620 0.028274
max 271.000000 274.290009 270.399994 270.829987 270.829987 8.869920e+07 10.396570 1.608871 260.397922 11.844461 277.221034 263.981180 268.157144 14.507949 6.206293 21.600006 90.473821 5.110634 4.420525 4.765163 38.550018 7.487299 0.236552 68.292867 -0.049300 68.355914 90.889135 89.175297 1.625807e+08 3.474256e+07 -5.681090e+07 9.093028e+07 5.792147e+07 9.093028e+07 9.093028e+07 9.093028e+07 0.0 9.093028e+07 5.792147e+07 5.792147e+07 5.792147e+07 0.0 5.792147e+07 58.000000 2896.000000 2925.000000 2925.000000 2925.000000 0.0 2925.000000 0.0 10.396570 0.098909 0.040412 0.040412
In [1320]:
# Drop every row that still contains a NaN value, modifying df in place.
# NOTE(review): the same dropna call already ran a few cells earlier — presumably redundant; confirm.
df.dropna(inplace=True)
In [1321]:
n_zoom = 365
sns.set_context("talk", font_scale=1.3)

# The zoom panel cannot show more rows than df holds (describe() above reports 246 rows),
# so clamp the window and label the panel with the number of points actually plotted.
n_shown = min(n_zoom, len(df))

# Visualize realized current vs. future volatility: full history on top, the most recent
# n_shown observations below. n_future and INTERVAL_WINDOW are defined in earlier cells.
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    # Top panel: the whole series.
    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
            label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    # Bottom panel: only the tail of the series.
    ax2.plot(df.vol_current[-n_shown:], alpha=.8, lw=2, color='gray', ls=':',
            label='Current Volatility')
    ax2.plot(df.vol_future[-n_shown:], lw=2, color='blue',
            label=f'Next {n_future} Days Volatility (TARGET)')

    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_shown} Days')

    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()

    plt.show();

Daily Volatility Distribution

In [1322]:
# Normalised 50-bin histogram of vol_current with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the pinned version.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(
        df['vol_current'],
        bins=50,
        norm_hist=True,
        fit=stats.norm,
        ax=ax,
    )
    ax.set_title('Daily Volatility Distribution')

    plt.show();

Experiment 2: weekly granularity

In [1329]:
# Dropdown used to choose which stock's dataset the next cell loads.
# 'SELECT' is the initial placeholder entry.
w = widgets.Dropdown(
    options=[
        'SELECT',
        'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
        'FB', 'GME', 'MCD', 'PFE', 'PLUG',
        'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU',
    ],
    value='SELECT',
    description='Stock name:',
)


def on_change(change):
    """Print the new selection when the observed change is a 'value' change."""
    value_changed = change['type'] == 'change' and change['name'] == 'value'
    if value_changed:
        print("You have selected %s" % change['new'])


# Register the callback, then render the widget in the cell output.
w.observe(on_change)
display(w)
You have selected BABA
In [1330]:
# Load the prepared CSV for the ticker chosen in the dropdown.
# All files follow the same '/content/Final_<TICKER>.csv' pattern, so the path
# is derived from the selection instead of repeating one branch per symbol.
ticker = w.value
if ticker == 'SELECT':
    # The placeholder entry has no file. Without this guard nothing would be
    # read and `df` would silently keep whatever an earlier cell left in it.
    raise ValueError(
        "No stock selected: choose a ticker in the dropdown before running this cell."
    )

# 'Date' is parsed as datetime and used as the index, which the weekly
# resampling further down relies on.
df = pd.read_csv(f'/content/Final_{ticker}.csv', parse_dates=['Date'], index_col=['Date'])
In [1331]:
# Show the column names of the frame loaded for the selected ticker.
df.columns
Out[1331]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [1332]:
# (rows, columns) of the loaded frame.
df.shape
Out[1332]:
(311, 52)
In [1333]:
# Count of missing values per column, to see which features need imputation.
df.isnull().sum()
Out[1333]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           2
NATR                          0
TRANGE                        0
DMI                           0
MACD                         10
MACDSIGNAL                   10
MACDHIST                     10
MOM                           0
PPO                           2
ROCP                          0
RSI                           0
TRIX                         65
ULTOSC                        5
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [1334]:
# Clean-up in one chain:
#   1. fill each column's remaining NaNs with that column's median,
#   2. remove the 'Unnamed: 0' column,
#   3. rename 'Real_or_Fake_tweet' to 'Fake_news'.
df = (
    df.fillna(df.median())
      .drop('Unnamed: 0', axis=1)
      .rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
)
In [1335]:
# Downsample to weekly ('W') bins, averaging every column within each week.
df_weekly = df.resample('W').mean()
In [1336]:
# (weeks, columns) after the weekly aggregation.
df_weekly.shape
Out[1336]:
(65, 51)
In [1337]:
# Annotated heatmap of the full pairwise correlation matrix of the weekly features.
# The wide figure is needed to keep the per-cell annotations legible.
plt.figure(figsize=(40,15))
sns.heatmap(df_weekly.corr(),annot=True)
Out[1337]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f0788da7950>
In [1338]:
# Shrink seaborn's font scale for the plots that follow.
sns.set(font_scale=0.8)
In [1339]:
# One 50-bin histogram per weekly feature; the trailing ';' suppresses the array-of-axes repr.
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
In [1340]:
# Weekly features whose correlation with AvgTrueRange exceeds 0.5 in absolute
# value, listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['AvgTrueRange']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with AvgTrueRange:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 7 strongly correlated values with AvgTrueRange:
AvgTrueRange    1.000000
TRANGE          0.762966
Variance        0.610741
AD              0.587674
Upperband       0.563288
NATR            0.552508
Middleband      0.501798
Name: AvgTrueRange, dtype: float64
In [1341]:
# Weekly features whose correlation with NATR exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['NATR']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with NATR :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 14 strongly correlated values with NATR :
NATR                     1.000000
Volume                   0.694601
N                        0.654232
E                        0.654232
C                        0.654232
O                        0.654232
Verified_status_False    0.653819
Verified_status_True     0.570909
AvgTrueRange             0.552508
Variance                 0.513161
Beta                     0.507946
APO                     -0.503947
MACD                    -0.510858
PPO                     -0.566123
Name: NATR, dtype: float64
In [1342]:
# Weekly features whose correlation with TRANGE exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['TRANGE']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with TRANGE:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 20 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.762966
Verified_status_True         0.622357
Volume                       0.610771
Variance                     0.521570
E                            0.514159
N                            0.514159
O                            0.514159
C                            0.514159
Verified_status_False        0.510622
B5_E_Um                      0.505524
B5_C_Um                      0.505524
B5_O_Um                      0.505524
Upward_momentum_created      0.505524
B5_N_Um                      0.505524
B5_E_Dm                      0.505524
B5_C_Dm                      0.505524
B5_O_Dm                      0.505524
B5_N_Dm                      0.505524
Downward_momentum_created    0.505524
Name: TRANGE, dtype: float64
In [1343]:
# Weekly features whose correlation with the 'O' (labelled Openness) column
# exceeds 0.5 in absolute value, listed from highest to lowest.
df_corr = df_weekly.corr()['O']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Openness:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 20 strongly correlated values with Openness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999950
Volume                       0.902567
Verified_status_True         0.841219
B5_E_Um                      0.681476
B5_C_Um                      0.681476
B5_O_Um                      0.681476
Upward_momentum_created      0.681476
B5_N_Um                      0.681476
B5_N_Dm                      0.681476
B5_E_Dm                      0.681476
B5_C_Dm                      0.681476
B5_O_Dm                      0.681476
Downward_momentum_created    0.681476
NATR                         0.654232
Variance                     0.540393
TRANGE                       0.514159
Name: O, dtype: float64
In [1344]:
# Weekly features whose correlation with the 'C' (labelled conscientiousness)
# column exceeds 0.5 in absolute value, listed from highest to lowest.
df_corr = df_weekly.corr()['C']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with conscientiousness:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 20 strongly correlated values with conscientiousness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999950
Volume                       0.902567
Verified_status_True         0.841219
B5_E_Um                      0.681476
B5_C_Um                      0.681476
B5_O_Um                      0.681476
Upward_momentum_created      0.681476
B5_N_Um                      0.681476
B5_N_Dm                      0.681476
B5_E_Dm                      0.681476
B5_C_Dm                      0.681476
B5_O_Dm                      0.681476
Downward_momentum_created    0.681476
NATR                         0.654232
Variance                     0.540393
TRANGE                       0.514159
Name: C, dtype: float64
In [1345]:
# Weekly features whose correlation with the 'E' column exceeds 0.5 in absolute
# value, listed from highest to lowest.
# 'E' is presumably the Big Five extraversion score (the 'O' and 'C' cells label
# their columns "Openness" and "conscientiousness") -- confirm against the data
# dictionary. The printed label used to say "conscientiousness", which
# mislabelled this output.
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with extraversion:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 20 strongly correlated values with conscientiousness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999950
Volume                       0.902567
Verified_status_True         0.841219
B5_E_Um                      0.681476
B5_C_Um                      0.681476
B5_O_Um                      0.681476
Upward_momentum_created      0.681476
B5_N_Um                      0.681476
B5_N_Dm                      0.681476
B5_E_Dm                      0.681476
B5_C_Dm                      0.681476
B5_O_Dm                      0.681476
Downward_momentum_created    0.681476
NATR                         0.654232
Variance                     0.540393
TRANGE                       0.514159
Name: E, dtype: float64
In [1346]:
# Weekly features whose correlation with the 'A' column exceeds 0.5 in absolute
# value, listed from highest to lowest.
# 'A' is presumably the Big Five agreeableness score -- confirm against the data
# dictionary. The printed label used to say "conscientiousness", which
# mislabelled this output.
# NOTE: if the result is empty (the column does not even list itself), its
# correlations are NaN, which is what a zero-variance column produces.
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with agreeableness:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with conscientiousness:
Series([], Name: A, dtype: float64)
In [1347]:
# Weekly features whose correlation with the 'N' column exceeds 0.5 in absolute
# value, listed from highest to lowest.
# 'N' is presumably the Big Five neuroticism score -- confirm against the data
# dictionary. The printed label used to say "conscientiousness", which
# mislabelled this output.
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with neuroticism:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 20 strongly correlated values with conscientiousness:
N                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999950
Volume                       0.902567
Verified_status_True         0.841219
B5_E_Um                      0.681476
B5_C_Um                      0.681476
B5_O_Um                      0.681476
Upward_momentum_created      0.681476
B5_N_Um                      0.681476
B5_N_Dm                      0.681476
B5_E_Dm                      0.681476
B5_C_Dm                      0.681476
B5_O_Dm                      0.681476
Downward_momentum_created    0.681476
NATR                         0.654232
Variance                     0.540393
TRANGE                       0.514159
Name: N, dtype: float64
In [1348]:
# Weekly features whose correlation with B5_O_Um exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_O_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_O_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_O_Um:
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_O_Um, dtype: float64
In [1349]:
# Weekly features whose correlation with B5_C_Um exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_C_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_C_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_C_Um:
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_C_Um, dtype: float64
In [1350]:
# Weekly features whose correlation with B5_E_Um exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_E_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_E_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_E_Um:
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_E_Um, dtype: float64
In [1351]:
# Weekly features whose correlation with B5_A_Um exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
# NOTE: an empty result (not even the column itself) means the coefficients are
# NaN, which is what a zero-variance column produces.
df_corr = df_weekly.corr()['B5_A_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_A_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [1352]:
# Weekly features whose correlation with B5_N_Um exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_N_Um']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_N_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_N_Um:
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_N_Um, dtype: float64

Downward momentum correlation

In [1353]:
# Weekly features whose correlation with B5_O_Dm exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_O_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_O_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_O_Dm:
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_O_Dm, dtype: float64
In [1354]:
# Weekly features whose correlation with B5_C_Dm exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_C_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_C_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_C_Dm:
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_C_Dm, dtype: float64
In [1355]:
# Weekly features whose correlation with B5_E_Dm exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_E_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_E_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_E_Dm:
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_E_Dm, dtype: float64
In [1356]:
# Weekly features whose correlation with B5_A_Dm exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
# NOTE: an empty result (not even the column itself) means the coefficients are
# NaN, which is what a zero-variance column produces.
df_corr = df_weekly.corr()['B5_A_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_A_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_A_Dm:
Series([], Name: B5_A_Dm, dtype: float64)
In [1357]:
# Weekly features whose correlation with B5_N_Dm exceeds 0.5 in absolute value,
# listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['B5_N_Dm']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_N_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with B5_N_Dm:
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: B5_N_Dm, dtype: float64
In [1358]:
# Weekly features whose correlation with Fake_news exceeds 0.5 in absolute
# value, listed from the highest coefficient to the lowest.
# The printed label uses the column's pre-rename name, Real_or_Fake_tweet.
# NOTE: an empty result (not even the column itself) means the coefficients are
# NaN, which is what a zero-variance column produces.
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with Real_or_Fake_tweet :
Series([], Name: Fake_news, dtype: float64)
In [1359]:
# Weekly features whose correlation with Downward_momentum_created exceeds 0.5
# in absolute value, listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['Downward_momentum_created']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Downward_momentum_created :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with Downward_momentum_created :
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: Downward_momentum_created, dtype: float64
In [1360]:
# Weekly features whose correlation with Upward_momentum_created exceeds 0.5
# in absolute value, listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['Upward_momentum_created']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Upward_momentum_created :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 18 strongly correlated values with Upward_momentum_created :
B5_E_Um                      1.000000
Upward_momentum_created      1.000000
B5_O_Um                      1.000000
B5_C_Um                      1.000000
B5_N_Um                      1.000000
B5_O_Dm                      1.000000
Downward_momentum_created    1.000000
B5_N_Dm                      1.000000
B5_E_Dm                      1.000000
B5_C_Dm                      1.000000
Verified_status_True         0.825992
N                            0.681476
E                            0.681476
O                            0.681476
C                            0.681476
Verified_status_False        0.676767
Volume                       0.597634
TRANGE                       0.505524
Name: Upward_momentum_created, dtype: float64
In [1361]:
# Weekly features whose correlation with Verified_status_True exceeds 0.5 in
# absolute value, listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['Verified_status_True']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Verified_status_True :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 21 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
N                            0.841219
E                            0.841219
C                            0.841219
O                            0.841219
Verified_status_False        0.835769
Volume                       0.827975
B5_E_Dm                      0.825992
B5_C_Dm                      0.825992
B5_O_Dm                      0.825992
B5_N_Dm                      0.825992
B5_E_Um                      0.825992
B5_C_Um                      0.825992
B5_O_Um                      0.825992
Downward_momentum_created    0.825992
Upward_momentum_created      0.825992
B5_N_Um                      0.825992
TRANGE                       0.622357
NATR                         0.570909
Variance                     0.510300
Beta                         0.505770
Name: Verified_status_True, dtype: float64
In [1362]:
# Weekly features whose correlation with Verified_status_False exceeds 0.5 in
# absolute value, listed from the highest coefficient to the lowest.
df_corr = df_weekly.corr()['Verified_status_False']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Verified_status_False :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 20 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
N                            0.999950
E                            0.999950
C                            0.999950
O                            0.999950
Volume                       0.901251
Verified_status_True         0.835769
B5_E_Um                      0.676767
B5_C_Um                      0.676767
B5_O_Um                      0.676767
Upward_momentum_created      0.676767
B5_N_Um                      0.676767
B5_N_Dm                      0.676767
B5_E_Dm                      0.676767
B5_C_Dm                      0.676767
B5_O_Dm                      0.676767
Downward_momentum_created    0.676767
NATR                         0.653819
Variance                     0.539335
TRANGE                       0.510622
Name: Verified_status_False, dtype: float64
In [1363]:
# Re-apply the smaller seaborn font scale before drawing the pair plots.
sns.set(font_scale=0.8)
In [1364]:
# Scatter NATR (y axis) against every weekly feature, five features per
# pairplot row so each figure stays readable.
weekly_columns = df_weekly.columns
for start in range(0, len(weekly_columns), 5):
    sns.pairplot(
        data=df_weekly,
        x_vars=weekly_columns[start:start + 5],
        y_vars=['NATR'],
    )
In [1365]:
# Replace every remaining NaN in the weekly frame with 0, in place.
# NOTE(review): this presumably targets weeks without any source rows -- confirm that 0 is a sensible stand-in.
df_weekly.fillna(0, inplace = True)
In [1366]:
# Drop rows containing NaN, in place. When the preceding fillna(0) cell has
# already run there are no NaNs left, so this removes nothing.
df_weekly.dropna(inplace=True)
In [1367]:
# Correlation matrix of the weekly features with 'Close' left out. Only cells
# with r >= 0.5 or r <= -0.4 are kept; all others become NaN and are therefore
# drawn blank, which highlights the stronger relationships.
corr = df_weekly.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))

sns.heatmap(
    corr.where((corr >= 0.5) | (corr <= -0.4)),
    vmin=-1.0, vmax=1.0, cmap='YlGnBu',
    annot=True, annot_kws={"size": 8},
    linewidths=0.1, square=True,
);

Weekly volatility distribution

In [1368]:
# Distribution of the weekly NATR values (50 bins, density-normalised) with a
# fitted normal curve drawn on top for comparison.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(
        df_weekly['NATR'],
        bins=50,
        norm_hist=True,
        fit=stats.norm,
        ax=ax,
    )
    ax.set_title('Weekly Volatility Distribution')

    plt.show()